{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "matplotlib.get_backend :  module://ipykernel.pylab.backend_inline\n"
     ]
    }
   ],
   "source": [
    "from build_predictions import * \n",
    "from GaussRank import GaussRankMap\n",
    "import pandas as pd \n",
    "from create_parquet import *\n",
    "from data import *\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR='/rapids/notebooks/srabhi/champs-2019/input/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['1JHC', '2JHC', '3JHC', '1JHN', '2JHN', '3JHN', '2JHH', '3JHH']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "COUPLING_TYPE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# node frame \n",
    "molecule_node = pd.read_parquet(DATA_DIR+'parquet/molecule_node.parquet')\n",
    "\n",
    "# edge frame \n",
    "molecule_edge = pd.read_parquet(DATA_DIR+'parquet/molecule_edge.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Get the three frames needed for building train / validation oupling stacking "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build Train / validation gaussrank value for each fold "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from time import time\n",
    "def save_cv_data(type_, fold):\n",
    "    print('type %s : %s' %(type_, COUPLING_TYPE[type_]))\n",
    "    # coupling frame of the type \n",
    "    coupling_frame = pd.read_csv(DATA_DIR+'parquet/baseline_coupling_frame.csv')\n",
    "    cols = ['molecule_name', 'num_coupling', 'coupling_dim', 'atom_index_0', 'atom_index_1', 'coupling_type', 'scalar_coupling',\n",
    "           'fc', 'sd', 'pso', 'dso', 'id']\n",
    "    coupling_frame = coupling_frame[cols]\n",
    "    coupling_frame = coupling_frame[coupling_frame.coupling_type == type_]\n",
    "    new_num_coupling = dict(coupling_frame.groupby('molecule_name').count()['num_coupling'])\n",
    "    coupling_frame.num_coupling = coupling_frame.molecule_name.map(new_num_coupling)\n",
    "\n",
    "    # shortest path\n",
    "    shortest_path_frame = pd.read_csv('/rapids/notebooks/srabhi/champs-2019/input/shortest_path.csv')\n",
    "    cols = [ 'id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'path_index_0', 'path_index_1',\n",
    "           'path_index_2', 'path_index_3', 'path_btype_0', 'path_btype_1',\n",
    "           'path_btype_2', 'path_a_num_0', 'path_a_num_1', 'path_a_num_2',\n",
    "           'path_a_num_3']\n",
    "    shortest_path_frame = shortest_path_frame[cols]\n",
    "    coupling_frame = pd.merge(coupling_frame, shortest_path_frame, on=['id', 'molecule_name', 'atom_index_0', 'atom_index_1'], how='left')\n",
    "    print(coupling_frame.shape)\n",
    "    max_ = coupling_frame.num_coupling.max()\n",
    "    COUPLING_MAX = max_\n",
    "    print('max coupling: %s' %max_)\n",
    "    print('fold: %s' %fold)\n",
    "    split_train = 'train_split_by_mol_hash.%s.npy'%fold\n",
    "    split_valid = 'valid_split_by_mol_hash.%s.npy'%fold\n",
    "    id_train_ = np.load(DATA_DIR + '/split/%s'%split_train,allow_pickle=True)\n",
    "    id_valid_ = np.load(DATA_DIR + '/split/%s'%split_valid,allow_pickle=True)\n",
    "    csv = 'test'\n",
    "    df = pd.read_csv(DATA_DIR + '/csv/%s.csv'%csv)\n",
    "    id_test_ = df.molecule_name.unique()\n",
    "    \n",
    "    train = coupling_frame[coupling_frame.molecule_name.isin(id_train_)]\n",
    "    validation = coupling_frame[coupling_frame.molecule_name.isin(id_valid_)]\n",
    "    test = coupling_frame[coupling_frame.molecule_name.isin(id_test_)]\n",
    "    \n",
    "    # Get GaussRank of coupling values \n",
    "    t0 = time()\n",
    "    grm = GaussRankMap()\n",
    "    df_train = train[['coupling_type', 'scalar_coupling']]\n",
    "    df_valid = validation[['coupling_type', 'scalar_coupling']]\n",
    "    df_train.columns = ['type', 'scalar_coupling_constant']\n",
    "    df_valid.columns = ['type', 'scalar_coupling_constant']\n",
    "    # Reverse type mapping \n",
    "    df_train.type = df_train.type.map(REVERSE_COUPLING_TYPE)\n",
    "    df_valid.type = df_valid.type.map(REVERSE_COUPLING_TYPE)\n",
    "    #fit grm \n",
    "    transformed_training = grm.fit_training(df_train, reset=True)\n",
    "    transformed_validation = grm.convert_df(df_valid, from_coupling=True)\n",
    "    validation['gaussrank_coupling'] =  transformed_validation\n",
    "    train['gaussrank_coupling'] = transformed_training\n",
    "    print('Getting gaussrank transformation for train/validation data took %s seconds' %(time()-t0))\n",
    "    print(grm.coupling_order)\n",
    "    test['gaussrank_coupling'] = 0 \n",
    "\n",
    "    general_coupling_frame = pd.concat([train, validation, test.fillna(0.0)])\n",
    "\n",
    "    # Build molecule coupling frame for fold \n",
    "    coupling_cols = ['atom_index_0', 'atom_index_1', 'coupling_type', 'scalar_coupling', 'gaussrank_coupling',\n",
    "                    'fc', 'sd', 'pso', 'dso', 'id',\n",
    "                    'path_index_0', 'path_index_1', 'path_index_2','path_index_3', \n",
    "                    'path_btype_0', 'path_btype_1', 'path_btype_2',\n",
    "                    'path_a_num_0', 'path_a_num_1', 'path_a_num_2', 'path_a_num_3']\n",
    "    \n",
    "    shared_cols = ['molecule_name', 'num_coupling', 'coupling_dim']\n",
    "\n",
    "    tmp = general_coupling_frame.groupby('molecule_name').apply(lambda x: x[coupling_cols].values.reshape(-1))\n",
    "    molecule_coupling = pd.DataFrame(tmp.values.tolist()).fillna(0.0)\n",
    "    molecule_coupling['molecule_name'] = tmp.index\n",
    "    molecule_coupling = molecule_coupling.merge(general_coupling_frame[shared_cols].drop_duplicates(), on='molecule_name', how='left')\n",
    "\n",
    "    cols = molecule_coupling.columns.tolist()\n",
    "\n",
    "    new_cols = cols[-3:] + cols[:-3]\n",
    "    molecule_coupling = molecule_coupling[new_cols]\n",
    "    print(molecule_coupling.shape)\n",
    "    molecule_coupling.columns = ['molecule_name', 'num_coupling', 'coupling_dim'] + ['coupling_%s'%i for i in range(COUPLING_MAX*21)]\n",
    "\n",
    "\n",
    "    node_edge_frame = pd.merge(molecule_node, molecule_edge, on='molecule_name', how='left')\n",
    "    general_stack_frame =  pd.merge(node_edge_frame, molecule_coupling, on='molecule_name', how='inner')\n",
    "\n",
    "    train_frame = general_stack_frame[general_stack_frame.molecule_name.isin(id_train_)]\n",
    "    validation_frame = general_stack_frame[general_stack_frame.molecule_name.isin(id_valid_)]\n",
    "    test_frame = general_stack_frame[general_stack_frame.molecule_name.isin(id_test_)]\n",
    "\n",
    "    \n",
    "    type_str = COUPLING_TYPE[type_]\n",
    "    os.makedirs(DATA_DIR+'rnn_parquet/fold_%s'%fold+'/%s'%type_str, exist_ok=True)\n",
    "    \n",
    "    validation_frame.to_parquet(DATA_DIR+'rnn_parquet/fold_%s'%fold+'/%s/validation.parquet'%type_str)\n",
    "    train_frame.to_parquet(DATA_DIR +'rnn_parquet/fold_%s'%fold+ '/%s/train.parquet'%type_str)\n",
    "    # save mapping\n",
    "    for i, (str_type_, frame) in enumerate(zip(grm.coupling_order, grm.training_maps)): \n",
    "        frame.to_csv(DATA_DIR +'rnn_parquet/fold_%s'%fold+'/%s/mapping_%s_order_%s.csv'%(str_type_, str_type_, i), index=False)\n",
    "    return test_frame\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "COUPLING_MAX_DICT = {'1JHC': 20, '2JHC': 36, '3JHC': 66, '1JHN': 8, '2JHN': 12, '3JHN': 18, '3JHH': 36, '2JHH': 19 }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "type 7 : 3JHH\n",
      "(908046, 23)\n",
      "max coupling: 36\n",
      "fold: 0\n",
      "Getting gaussrank transformation for train/validation data took 0.636885404586792 seconds\n",
      "['3JHH']\n",
      "(118341, 759)\n",
      "type 7 : 3JHH\n",
      "(908046, 23)\n",
      "max coupling: 36\n",
      "fold: 1\n",
      "Getting gaussrank transformation for train/validation data took 0.6745212078094482 seconds\n",
      "['3JHH']\n",
      "(118341, 759)\n",
      "type 7 : 3JHH\n",
      "(908046, 23)\n",
      "max coupling: 36\n",
      "fold: 2\n",
      "Getting gaussrank transformation for train/validation data took 0.6809020042419434 seconds\n",
      "['3JHH']\n",
      "(118341, 759)\n",
      "type 7 : 3JHH\n",
      "(908046, 23)\n",
      "max coupling: 36\n",
      "fold: 3\n",
      "Getting gaussrank transformation for train/validation data took 0.6988716125488281 seconds\n",
      "['3JHH']\n",
      "(118341, 759)\n"
     ]
    }
   ],
   "source": [
    "for type_ in range(8)\n",
    "    for fold in range(4):\n",
    "        test_frame = save_cv_data(type_, fold) \n",
    "        test_frame.to_parquet(DATA_DIR +'/rnn_parquet/test_%s.parquet'%COUPLING_TYPE[type_])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
